# **MODIFY THIS CHUNK**
library(here)
kundaje_dir    <- trimws(readr::read_lines("../../AK_PROJ_DIR.txt"))
doc_id         <- "04c"
out            <- here( "output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)
figout         <- here( "figures/03-chrombpnet/03-syntax", doc_id, "/"); dir.create(figout, recursive = TRUE)
chrombpnet_dir <- here( "output/03-chrombpnet")

1 Overview

Here, we load the results of in silico marginalizations to assess motif cooperativity/synergy and syntax constraints.

We will assign motif pairs as having synergy or not, and having hard or soft syntax (or both or neither).

2 Set up

library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(scales)
library(glue)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(pheatmap)
library(ggseqlogo)
library(universalmotif)
library(TFBSTools)

# for interactive stuff
library(plotly)
library(reactable)
library(shiny)

script_path <- here( "code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))

ggplot2::theme_set(theme_BOR() + theme(strip.background = element_blank()))

3 Load data

3.1 Motif data

# load hits
hits_all <- readRDS(file.path(chrombpnet_dir, "03-syntax/01/hits.Rds")) %>% bind_rows()

# load intermediate objects
load(file.path(chrombpnet_dir, "03-syntax/01/intermediate_obj.Rda"))

Load merged modisco reports:

cwm_means <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled/cwm_sums_df.tsv"))
## Rows: 6362 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): cluster, pattern_class, motif, component_pattern
## dbl (4): cwm_sum, abs_cwm_sum, cwm_mean, abs_cwm_mean
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
modisco_merged <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled_anno/modisco_merged_reports.tsv")) %>% 
  # get short cluster ID
  left_join(cluster_meta %>% dplyr::select(Cluster, Cluster_ChromBPNet),
            by = c("component_celltype" = "Cluster_ChromBPNet")) %>% 
  left_join(cwm_means, by = "component_pattern")
## Rows: 6362 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): merged_pattern, component_celltype, pattern_class, pattern, compone...
## dbl (1): n_seqlets
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(modisco_merged)
## # A tibble: 6 × 14
##   merged_pattern            component_celltype pattern_class.x pattern n_seqlets
##   <chr>                     <chr>              <chr>           <chr>       <dbl>
## 1 neg.Average_12__merged_p… Spleen_c5          neg_patterns    patter…        88
## 2 neg.Average_12__merged_p… Spleen_c0          neg_patterns    patter…       461
## 3 neg.Average_12__merged_p… Skin_c5            neg_patterns    patter…       302
## 4 neg.Average_12__merged_p… Heart_c13          neg_patterns    patter…       128
## 5 neg.Average_12__merged_p… Skin_c7            neg_patterns    patter…       708
## 6 neg.Average_12__merged_p… Heart_c5           neg_patterns    patter…        47
## # ℹ 9 more variables: component_pattern <chr>, Cluster <chr>, cluster <chr>,
## #   pattern_class.y <chr>, motif <chr>, cwm_sum <dbl>, abs_cwm_sum <dbl>,
## #   cwm_mean <dbl>, abs_cwm_mean <dbl>

3.2 In silico marginalization results

ism_dir <- file.path(chrombpnet_dir, "03-syntax/04b/in_silico_marginalization/")
fs::dir_exists(ism_dir)
## /oak/stanford/groups/wjg/skim/projects/HDMA-public/output/03-chrombpnet/03-syntax/04b/in_silico_marginalization/ 
##                                                                                                             TRUE

List of composites to test:

compo_to_test <- read_tsv("04b-composites_to_test.tsv")
## Rows: 157 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): motif_name, Cluster, category, annotation_broad, component_motifA,...
## dbl  (8): n_hits_per_cluster, idx_uniq, variant_idx, total_hits, start_A, en...
## lgl  (3): composite_cwm_fwd, componentA_cwm_fwd, componentB_cwm_fwd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
compo_to_test$motif_name_safe <- gsub("\\/", "\\.", compo_to_test$motif_name)
compo_to_test$done <- ifelse(file.exists(paste0(ism_dir, compo_to_test$motif_name_safe, "/.done")),
                             TRUE,
                             FALSE)

sum(compo_to_test$done)
## [1] 138
compo_to_test_done <- compo_to_test %>% filter(done) %>% filter(test_spacing == "Y") %>%
  dplyr::rename(seqA = test_seqA, seqB = test_seqB)

Check for duplicates:

# sort the two motif sequences alphabetically so we can find duplicates even
# when the order is swapped
pairs <- map_dfr(1:nrow(compo_to_test_done), function(i) {
  
  motifs <- compo_to_test_done[i, c("seqA", "seqB")] %>% unlist() %>% sort()
  data.frame(motif_name = compo_to_test_done$motif_name[i],
             A = motifs[[1]],
             B = motifs[[2]])
  
})

# should be empty if no duplicates.
pairs %>% group_by(A, B) %>% mutate(n = n()) %>% filter(n > 1)
## # A tibble: 0 × 4
## # Groups:   A, B [0]
## # ℹ 4 variables: motif_name <chr>, A <chr>, B <chr>, n <int>

Outputs of the in silico marginalization experiments:

result_outs <- map_dfr(compo_to_test_done$motif_name_safe,
                       ~ data.table::fread(glue("{ism_dir}/{.x}/results.tsv"), data.table = FALSE)) %>% 
  left_join(compo_to_test_done %>% dplyr::select(motif_name, category), by = "motif_name") %>% 
  dplyr::select(-seqA_palindrome, -seqB_palindrome)

Z-scores on the marginal joint effects:

z_scores <- map2_dfr(compo_to_test_done$motif_name_safe, compo_to_test_done$motif_name,
                     function(motif_safe, motif) {
                       
                       df <- data.table::fread(glue("{ism_dir}/{motif_safe}/Z_scores.tsv"), data.table = FALSE) %>% 
                         mutate(motif_name = motif)
                       df
                       
                     })  %>% 
  left_join(compo_to_test_done %>% dplyr::select(motif_name, category), by = "motif_name")

Summarized effects for solo motifs:

solo_df <- map2_dfr(compo_to_test_done$motif_name_safe, compo_to_test_done$motif_name,
                    ~ data.table::fread(glue("{ism_dir}/{.x}/summarized_effects_solo.tsv"), data.table = FALSE) %>% 
                      mutate(motif_name = .y))  %>% 
  left_join(compo_to_test_done %>% dplyr::select(motif_name, category), by = "motif_name") %>% 
  dplyr::rename(orientation = key)

Summarized effects for spaced motifs:

spacing_df <- map2_dfr(compo_to_test_done$motif_name_safe, compo_to_test_done$motif_name,
                       ~ data.table::fread(glue("{ism_dir}/{.x}/summarized_effects_per_spacing.tsv"),
                                           data.table = FALSE) %>% 
                         mutate(motif_name = .y)) %>% 
  left_join(compo_to_test_done %>% dplyr::select(motif_name, category, seqA, seqB), by = "motif_name") %>% 
  dplyr::rename(orientation = key) %>% 
  rowwise() %>% 
  mutate(dist_center_to_center = round(nchar(seqA)/2) + spacing + round(nchar(seqB)/2))

Save intermediates:

save(spacing_df, solo_df, z_scores, result_outs, compo_to_test, file = glue("{out}/intermediate_obj.Rda"))

4 Palettes

blues <- RColorBrewer::brewer.pal(6, "Blues")
purples <- RColorBrewer::brewer.pal(6, "Purples")

palette_orientation_hetero <- c('A,B HT' = blues[3],
                                'B,A HT' = blues[4],
                                'HH'     = blues[5],
                                'TT'     = blues[6])

palette_orientation_homo <- c('HT' = purples[2],
                              'HH' = purples[4],
                              'TT' = purples[6])

palette_coop <- c("homo - not cooperative"   = "gray70",
                  "hetero - not cooperative" = "gray70",
                  "homo - cooperative"       = "#7696f5",
                  "hetero - cooperative"     = "#ab7ec2",
                  "homo - specific"          = "royalblue3",
                  "hetero - specific"        = "darkorchid4")

shapemap_result <- c("no synergy"  = 1,
                     "soft"        = 24,
                     "hard & soft" = 23,
                     "hard"        = 22)

cmap_category["not cooperative"] <- "gray80"

palette_motifs <- colorRampPalette(RColorBrewer::brewer.pal(9, "Blues")[3:9])(n=nrow(compo_to_test_done))

5 Data wrangling

We want to combine results into one dataframe:

# join results
all_results_best <- compo_to_test_done %>%
  dplyr::select(motif_name, Cluster, n_hits_per_cluster, idx_uniq, annotation_broad, component_motifA, component_motifB, seqA, seqB) %>% 
  left_join(result_outs, by = "motif_name") %>% 
  left_join(z_scores %>% dplyr::select(-category),
            by = c("motif_name", "best_orientation" = "orientation", "best_spacing" = "spacing")) %>% 
  left_join(spacing_df %>%
              dplyr::select(-category, -effect_mean, -effect_sd, -seqA, -seqB) %>% 
              dplyr::rename(best_counts_after_mean = counts_after_mean,
                            best_counts_after_sd = counts_after_sd),
            by = c("motif_name", "best_orientation" = "orientation", "best_spacing" = "spacing")) %>% 
  left_join(solo_df %>% 
              dplyr::filter(orientation == "A + B") %>% 
              dplyr::select(motif_name,
                            sum_counts_after_mean = counts_after_mean,
                            sum_counts_after_sd = counts_after_sd,
                            sum_effect = effect_mean)) %>% 
  # the maximum difference between joint vs independent effects across all arrangements
  mutate(joint_vs_ind_max = joint_effect - sum_effect)
## Joining with `by = join_by(motif_name)`
# adjust p-value
all_results_best$joint_vs_sum_p_value_adj <- p.adjust(all_results_best$joint_vs_sum_p_value, method = "BH")

We want to calculate the difference in joint/independent effects for each arrangement;

all_results_spacing <- spacing_df %>% 
  dplyr::rename(joint_effect = effect_mean) %>% 
  left_join(solo_df %>% filter(orientation == "A + B") %>%
              dplyr::select(-orientation, -counts_after_mean, -counts_after_sd, -effect_sd, sum_effect = effect_mean),
            by = c("motif_name", "category")) %>% 
  mutate(joint_vs_ind_per_arragement = joint_effect - sum_effect)

Let’s annotate the best orientation for each motif:

all_results_spacing$is_best_orientation <- FALSE

get_best_orientation <- function(motif) {
  
  all_results_best %>% filter(motif_name == motif) %>% pull(best_orientation)
  
}

motifs <- unique(all_results_spacing$motif_name)

for (i in seq_along(motifs)) {
  
  best_ori <- get_best_orientation(motifs[i])
  all_results_spacing[all_results_spacing$motif_name == motifs[i] & all_results_spacing$orientation == best_ori, ]$is_best_orientation <- TRUE
  
}

Let’s compute the max difference in joint vs ind. effects at the best orientation, in the 20-150bp range:

tmp <- all_results_spacing %>% 
  filter(is_best_orientation) %>% 
  filter(spacing %in% 20:150) %>%
  group_by(motif_name) %>%
  slice_max(n = 1, order_by = joint_effect) %>%
  mutate(joint_vs_ind_med_distance = joint_effect - sum_effect) %>% 
  dplyr::select(motif_name, joint_vs_ind_med_distance) %>% 
  ungroup()


all_results_best <- all_results_best %>% left_join(tmp, by = "motif_name")

6 Assess coooperativity and syntax requirements

6.1 Distribution of predicted counts

What’s the distribution of counts?

# first, get the mean
motif_order_mean <- spacing_df %>% 
  group_by(motif_name) %>% 
  summarize(mean_counts = mean(counts_after_mean)) %>% 
  arrange(desc(mean_counts)) %>% 
  pull(motif_name)

spacing_df %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order_mean)) %>% 
  mutate(upper = counts_after_mean + counts_after_sd,
         lower = counts_after_mean - counts_after_sd) %>% 
  ggplot(aes(x = motif_name, y = counts_after_mean)) +
  geom_point(aes(fill = category), shape = 21, color = "white", size = 2, alpha = 0.5) +
  # geom_errorbar(aes(color = category, ymin = lower, ymax = upper), width = 0.1) +
  scale_fill_manual(values = cmap_category) +
  # scale_color_manual(values = cmap_category) +
  facet_grid(. ~ category, space = "free_x", scales = "free_x") +
  theme(strip.background = element_blank(),
        axis.text.x = element_text(size = 9)) +
  rotate_x() +
  ylab("Mean predicted log-counts for AB") +
  ggtitle("Distribution of predicted counts for all spacings/orientations of each comp. motif") +
  no_legend()

6.2 Account for marginal DeepSHAPs

We also manually verified that DeepSHAPs on the ISM confirm that the motifs we inserted are indeed the ones driving the predicted accessibility. We thus remove them from downstream analysis.

dim(all_results_best)
## [1] 138  30
motifs_no_contrib_support <- as.numeric(c("33", "40", "56", "72", "97", "104", "105", "152",
                                          "164", "187", "191", "217", "281", "310", "338",
                                          "349", "357", "413"))

6.3 Assign results

p_val_thresh     <- 0.001
z_thresh         <- 4
delta_thresh     <- 0.15

# decide if significant
all_results_best <- all_results_best %>%
  # we exclude cases where the contributions at the optimal arrangement didn't
  # reflect the motifs we inserted, since it tells us the models are not performing
  # the experiment we intended/not responding to those sequences
  filter(!(idx_uniq %in% motifs_no_contrib_support)) %>% 
  mutate(result = case_when(
    joint_vs_sum_p_value_adj < p_val_thresh & joint_vs_ind_max > delta_thresh & joint_vs_ind_med_distance > delta_thresh & max_Z > z_thresh ~ "hard & soft",
    joint_vs_sum_p_value_adj < p_val_thresh & joint_vs_ind_max > delta_thresh & max_Z > z_thresh ~ "hard",
    joint_vs_sum_p_value_adj < p_val_thresh & joint_vs_ind_med_distance > delta_thresh ~ "soft",
    TRUE ~ "no synergy"))

all_results_spacing <- all_results_spacing %>%
  left_join(all_results_best %>% dplyr::select(motif_name, result), by = "motif_name") %>% 
  # remove filtered-out motifs
  filter(!is.na(result)) %>% 
  mutate(orientation = ifelse(orientation == "HT", "A,B HT", orientation),
         result = factor(result, levels = rev(names(shapemap_result))))

z_scores <- z_scores %>% mutate(orientation = ifelse(orientation == "HT", "A,B HT", orientation))

6.4 Account for palindromic sequences

First, let’s basically score the similarity of each motif to its reverse complement. Importantly, since we are operating with specific sequences for each motif, we’ll quantify similarity on that sequence rather than the motif it originates from. We’ll look at the distribution to understand what are palindromes, quasi-palindromes, and non-palindromes.

seqs_tested <- unique(c(all_results_best$seqA, all_results_best$seqB))
length(seqs_tested)
## [1] 80
# convert these to motifs -- basically the PWMs will be the one-hot encoded versions
seqs_tested_ppm <- map(seqs_tested, ~ universalmotif::create_motif(.x, name = .x))
names(seqs_tested_ppm) <- seqs_tested

# reverse complement
# revcomp returns just the RC'd motif, so we need create_motif to recreate a motif class
# in-place modification to set the name as the reverse complemented sequence
# which is just the consensus sequence from the reverse complemented PPM of the original motif
seqs_tested_ppm_rc <- map(seqs_tested_ppm, ~ universalmotif::create_motif(input = revcomp(.x@motif)) %>% 
                            set_attr("name", .@consensus))

# iterate over the two lists and calculate the similarity of a sequence to its
# reverse complement
self_sim <- map2_dfr(seqs_tested_ppm, seqs_tested_ppm_rc, function(fw, rev) {
  
  data.frame("motif_fw" = fw@name,
             "motif_rev" = rev@name,
             "similarity" = universalmotif::compare_motifs(motifs = list(fw, rev), tryRC = FALSE, use.type = "PPM") %>% .[1, 2])
  
})

Note that even some motifs which have self-similarity == 1 are not perfectly palindromic, as they may differ by one nucleotide on one side, e.g. CTAATTA and its reverse complement TAATTAG.

p_sim <- self_sim %>% 
  arrange(similarity) %>% 
  mutate(motifs = paste0(motif_fw, ", ", motif_rev)) %>% 
  mutate(motifs = factor(motifs, levels = unique(.$motifs))) %>% 
  ggplot(aes(x = motifs, y = similarity, label = motifs)) +
  geom_hline(yintercept = 0.7, color = "black") +
  geom_point(color = "red") +
  # scale_color_manual(values = cmap_category) +
  coord_flip()

ggplotly(p_sim)

Get categories which are palindromic:

anno_pal <- self_sim %>% 
  filter(motif_fw == motif_rev) %>% 
  dplyr::pull(motif_fw) %>% 
  unique()

anno_quasipal <- self_sim %>% 
  filter(similarity >= 0.7) %>% 
  dplyr::pull(motif_fw) %>% 
  unique()

anno_quasipal <- base::setdiff(anno_quasipal, anno_pal)

# sanity check, should be 0
base::intersect(anno_pal, anno_quasipal)
## character(0)
all_results_best <- all_results_best %>% 
  mutate(
    seqA_palindrome = case_when(
      seqA %in% anno_pal ~ "P",
      seqA %in% anno_quasipal ~ "Q",
      grepl("BHLH|BZIP|NFI", component_motifA)  ~ "Q",
      TRUE ~ "-"),
    seqB_palindrome = case_when(
      seqB %in% anno_pal ~ "P",
      seqB %in% anno_quasipal ~ "Q",
      grepl("BHLH|BZIP|NFI", component_motifB)  ~ "Q",
      TRUE ~ "-")) %>% 
  # a heuristic way to sort motifs
  mutate(palindromicity = case_when(
    seqA_palindrome == "P" & seqB_palindrome == "P" ~ 5,
    seqA_palindrome != "-" & seqB_palindrome != "-" ~ 4,
    seqA_palindrome == "P" | seqB_palindrome == "P" ~ 3, 
    seqA_palindrome != "-" & seqB_palindrome == "-" ~ 2.5,
    seqB_palindrome != "-" & seqA_palindrome == "-" ~ 2,
    seqA_palindrome == "-" & seqB_palindrome == "-" ~ 1
  )) %>% 
  mutate(result = factor(result, levels = c("hard", "hard & soft", "soft", "no synergy")))

all_results_best %>% write_tsv(glue("{out}/composite_motif_ISM_results.tsv"))

6.5 Tally results

Tally up the results:

dim(all_results_best)
## [1] 120  34
table(all_results_best$result)
## 
##        hard hard & soft        soft  no synergy 
##          39           9          19          53
# proportions
table(all_results_best$result)/sum(table(all_results_best$result))
## 
##        hard hard & soft        soft  no synergy 
##   0.3250000   0.0750000   0.1583333   0.4416667
DT::datatable(all_results_best)
all_results_best %>%
  mutate(result = factor(result, levels = c("hard", "hard & soft", "soft", "no synergy"))) %>%
  ggplot(aes(x = result)) + geom_bar(fill = "black") + rotate_x() +
  scale_y_continuous(breaks = seq(5, 55, 5))

6.6 Prep table

all_results_best %>%
  dplyr::select(motif_name, idx_uniq, category, result, Cluster_ChromBPNet = Cluster,
                annotation_broad, component_motifA, component_motifB, seqA, seqB,
                best_orientation,
                best_distance_between_motifs = best_spacing,
                best_dist_centers = dist_center_to_center, best_seq,
                joint_effect_log_counts = joint_effect,
                joint_vs_sum_p_value,
                joint_vs_sum_p_value_adj,
                Z_scored_joint_effect = Z,
                independent_effect = sum_effect,
                seqA_palindrome,
                seqB_palindrome) %>% 
  mutate(result = factor(result, levels = c("hard", "hard & soft", "soft", "no synergy"))) %>% 
  arrange(result) %>% 
  write_csv(glue("{out}/TABLE_ism_synergy_results.csv"))

7 Overview of scores

7.1 Distance between motifs

What’s the distribution of spacing we see for specific interactions?

df1 <- all_results_best %>% 
  filter(result %in% c("hard", "hard & soft")) %>%
  group_by(result, best_spacing, category) %>% 
  count() %>% 
  dplyr::rename(distance = best_spacing) %>% 
  mutate(type = "dist_spacing")

df2 <- all_results_best %>% 
  filter(result %in% c("hard", "hard & soft")) %>%
  group_by(result, dist_centers, category) %>% 
  count() %>% 
  mutate(n = -n) %>% 
  dplyr::rename(distance = dist_centers) %>% 
  mutate(type = "dist_centers")

bind_rows(df1, df2) %>% 
  ggplot(aes(x = distance, y = n)) +
  geom_hline(yintercept = 0) +
  geom_col(aes(fill = type)) +
  scale_fill_manual(values = c("dist_spacing" = "black", "dist_centers" = "darkorchid4")) +
  xlab("distance (bp)") + ylab("Number of motif pairs") +
  scale_x_continuous(breaks = seq(0, 19, by = 1), limits = c(-1, 18)) +
  scale_y_continuous(breaks = seq(-5, 10)) +
  no_legend() +
  theme(axis.text.x = element_text(size = 9))

7.2 Prevalence and importance of syntax motifs

First, we can ask among all 508 motifs, where the do the motifs w/ syntax effects rank. However, motifs which are cell type specific may have their signal diluted here by motifs which are ubiquitous and frequent.

We will focus on positive patterns for this analysis.

cmap_synergy <- c("hard" = "red3",
                  "hard & soft" = "red3",
                  "soft" = "salmon",
                  "no synergy" = "gray40",
                  "not tested" = "gray90")

motifs_w_results <- all_results_best %>%
  mutate(result = as.character(result)) %>% 
  dplyr::select(motif_name, result) %>% 
  # filter(result %in% c("hard", "hard & soft", "soft")) %>% 
  right_join(motifs_compiled_unique) %>% 
  mutate(result = ifelse(is.na(result), "not tested", result)) %>% 
  arrange(total_hits) %>% 
  filter(pattern_class == "pos_patterns") %>% 
  dplyr::select(motif_name, result, total_hits, category) %>% 
  mutate(rank = dense_rank(-total_hits))
## Joining with `by = join_by(motif_name)`
table(motifs_w_results$result)
## 
##        hard hard & soft  no synergy  not tested        soft 
##          39           9          53         373          19
print(dim(motifs_w_results))
## [1] 493   5
hits_motifs_w_hard_syntax <- motifs_w_results %>% filter(result %in% c("hard", "hard & soft")) %>%
  pull(total_hits)
hits_motifs_w_soft_syntax <- motifs_w_results %>% filter(result %in% c("soft")) %>%
  pull(total_hits)
hits_motifs_w_no_syntax <- motifs_w_results %>% filter(result %in% c("no synergy", "not tested")) %>% pull(total_hits)

length(hits_motifs_w_hard_syntax)
## [1] 48
length(hits_motifs_w_soft_syntax)
## [1] 19
length(hits_motifs_w_no_syntax)
## [1] 426
(ks_out <- ks.test(hits_motifs_w_hard_syntax, hits_motifs_w_no_syntax))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  hits_motifs_w_hard_syntax and hits_motifs_w_no_syntax
## D = 0.091843, p-value = 0.86
## alternative hypothesis: two-sided
(ks_out2 <- ks.test(hits_motifs_w_soft_syntax, hits_motifs_w_no_syntax))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  hits_motifs_w_soft_syntax and hits_motifs_w_no_syntax
## D = 0.20064, p-value = 0.4567
## alternative hypothesis: two-sided
motifs_w_results %>% 
  mutate(result = factor(result, levels = names(cmap_synergy))) %>% 
  ggplot(aes(x = rank, y = log10(total_hits))) +
  geom_hline(yintercept = 0) +
  geom_point(aes(color = result), alpha = 0.7, size = 3) +
  geom_segment(data = motifs_w_results %>% filter(!(result %in% c("not tested", "no synergy"))),
               mapping = aes(x = rank, y = 0,
                             xend = rank, yend = -1/3,
                             color = result),
               size = 1) +
  # geom_line(aes(group = result)) +
  scale_color_manual(values = cmap_synergy) +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +
  annotate("text",
           label = glue("KS test p-value for motifs \nw/ hard syntax vs others: \np={round(ks_out$p.value, 3)}"),
           x = 250, y = 6, size = 4) +
  xlab("motif rank")

This means at the global level, motifs w/ hard/soft syntax don’t seem more prevalent based on number of hits.

Which cell types have at least one motif w/ hard/soft syntax?

# this includes all 189 clusters
length(unique(modisco_merged$Cluster))
## [1] 189
# motifs with hard & soft syntax are being counted as hard here:
orig_motifs_joined_w_syntax <- motifs_compiled_unique %>% 
  filter(pattern_class == "pos_patterns") %>% 
  dplyr::select(motif_name, pattern) %>% 
  left_join(all_results_best %>% dplyr::select(motif_name, result), by = "motif_name") %>% 
  mutate(result = case_when(
    is.na(result) ~ "not tested",
    result == "hard & soft" ~ "hard",
    TRUE ~ result)) %>% 
  left_join(modisco_merged %>% dplyr::select(merged_pattern, Cluster_ChromBPNet = component_celltype, cwm_mean, cwm_sum), by = c("pattern" = "merged_pattern")) %>% 
  left_join(cluster_meta %>% dplyr::select(Cluster_ChromBPNet, organ))
## Joining with `by = join_by(Cluster_ChromBPNet)`
counts_motifs_per_type_per_celltype <- orig_motifs_joined_w_syntax %>% 
  filter(result %in% c("hard", "soft")) %>% 
  group_by(organ, Cluster_ChromBPNet, result) %>% 
  count()

head(counts_motifs_per_type_per_celltype)
## # A tibble: 6 × 4
## # Groups:   organ, Cluster_ChromBPNet, result [6]
##   organ   Cluster_ChromBPNet result     n
##   <chr>   <chr>              <chr>  <int>
## 1 Adrenal Adrenal_c0         soft       2
## 2 Adrenal Adrenal_c1         soft       1
## 3 Adrenal Adrenal_c2         soft       2
## 4 Adrenal Adrenal_c3         soft       1
## 5 Brain   Brain_c0           soft       1
## 6 Brain   Brain_c1           hard       1
table(counts_motifs_per_type_per_celltype$result)
## 
## hard soft 
##  115  113
(props <- table(counts_motifs_per_type_per_celltype$result)/189)
## 
##      hard      soft 
## 0.6084656 0.5978836
length(unique(counts_motifs_per_type_per_celltype$Cluster_ChromBPNet))/189
## [1] 0.8148148
counts_motifs_per_type_per_celltype %>% 
  mutate(n = ifelse(result == "soft", -n, n)) %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = n)) +
  geom_bar(aes(fill = organ), stat = "identity") +
  geom_hline(yintercept = 0) +
  scale_fill_manual(values = cmap_organ) +
  xlab("cluster") +
  rotate_x() +
  theme(axis.text.x = element_text(size = 11)) +
  scale_y_continuous(breaks = seq(-4, 10, by = 2)) +
  ylab("<--soft---     number of motifs w/ significant effect     ----hard--->") +
  ggtitle(glue("prop. of cell types w/ at least one hard syntax motif: {round(props['hard'], 2)}\nprop. of cell types w/ at least one soft syntax motif: {round(props['soft'], 2)}"))

What about prevalence (in terms of number of instances) within cell types?

# these are the motifs from all original clusters, that did not get clustered
# into patterns that were exluded due to QC
orig_motifs_joined_w_syntax %>% dim
## [1] 5334    7
orig_motifs_joined_w_syntax2 <- orig_motifs_joined_w_syntax %>% 
  left_join(hits_all_anno %>% dplyr::select(motif_name, Cluster_ChromBPNet = Cluster, n_hits = n), by = c("motif_name", "Cluster_ChromBPNet")) %>% 
  mutate(result = factor(result, levels = rev(names(cmap_synergy)))) %>% 
  filter(!is.na(n_hits))

# orig_motifs_joined_w_syntax2 %>% 
#   ggplot(aes(x = Cluster_ChromBPNet, y = log10(n_hits))) +
#   geom_point(aes(color = result), alpha = 0.7) +
#   scale_color_manual(values = cmap_synergy) +
#   rotate_x() +
#   xlab("cluster") + ylab("log10(number of hits)") +
#   ggtitle("number of hits per motif per cluster \n(1 point = 1 motif in 1 cluster)")

Representing these as normalized ranks:

motifs_ranked <- orig_motifs_joined_w_syntax2 %>% 
  group_by(Cluster_ChromBPNet) %>% 
  mutate(rank = dense_rank(n_hits)) %>% 
  mutate(norm_rank = rank/max(rank))

any(is.na(motifs_ranked$norm_rank))
## [1] FALSE
motifs_ranked %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = norm_rank)) +
  geom_point(aes(color = result), alpha = 0.7) +
  scale_color_manual(values = cmap_synergy) +
  rotate_x() +
  xlab("cluster") + ylab("normalized rank within the cluster (rank / max_rank)") +
  ggtitle("normalized rank by number of hits per motif per cluster \n(1 point = 1 motif in 1 cluster)")

Now we can do a similar analysis ranking motifs by CWM mean:

# these are the motifs from all original clusters, that did not get clustered
# into patterns that were exluded due to QC
orig_motifs_joined_w_syntax %>% dim
## [1] 5334    7
# orig_motifs_joined_w_syntax2 %>% 
#   ggplot(aes(x = Cluster_ChromBPNet, y = cwm_mean)) +
#   geom_point(aes(color = result), alpha = 0.7) +
#   scale_color_manual(values = cmap_synergy) +
#   rotate_x() +
#   xlab("cluster") + ylab("mean CWM importance") +
#   ggtitle("mean importance per trimmed motif per cluster \n(1 point = 1 motif in 1 cluster)")

Representing these as normalized ranks:

motifs_ranked2 <- orig_motifs_joined_w_syntax2 %>% 
  group_by(Cluster_ChromBPNet) %>% 
  mutate(rank = dense_rank(cwm_mean)) %>% 
  mutate(norm_rank = rank/max(rank))

motifs_ranked2 %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = norm_rank)) +
  geom_point(aes(color = result), alpha = 0.7) +
  scale_color_manual(values = cmap_synergy) +
  rotate_x() +
  xlab("cluster") + ylab("normalized rank within the cluster (rank / max_rank)") +
  ggtitle("normalized rank by mean importance per motif per cluster \n(1 point = 1 motif in 1 cluster)")

We can also rank by the CWM sum:

# these are the motifs from all original clusters, that did not get clustered
# into patterns that were exluded due to QC
orig_motifs_joined_w_syntax %>% dim
## [1] 5334    7
orig_motifs_joined_w_syntax2 %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = cwm_sum)) +
  geom_point(aes(color = result), alpha = 0.7) +
  scale_color_manual(values = cmap_synergy) +
  rotate_x() +
  xlab("cluster") + ylab("total CWM importance") +
  ggtitle("summed importance per trimmed motif per cluster \n(1 point = 1 motif in 1 cluster)")

Representing these as normalized ranks:

motifs_ranked3 <- orig_motifs_joined_w_syntax2 %>% 
  group_by(Cluster_ChromBPNet) %>% 
  mutate(rank = dense_rank(cwm_sum)) %>% 
  mutate(norm_rank = rank/max(rank))

motifs_ranked3 %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = norm_rank)) +
  geom_point(aes(color = result), alpha = 0.7) +
  scale_color_manual(values = cmap_synergy) +
  rotate_x() +
  xlab("cluster") + ylab("normalized rank within the cluster (rank / max_rank)") +
  ggtitle("normalized rank by total importance per motif per cluster \n(1 point = 1 motif in 1 cluster)")

In cell types that have these types of motifs, what’s the mean normalized rank for number of hits or summed importance for motifs with a given type of synergy, among all motifs learned in that cell type?

motifs_ranked_mean <- motifs_ranked %>% 
  group_by(Cluster_ChromBPNet, result) %>% 
  summarize(mean_norm_rank = mean(norm_rank)) %>% 
  mutate(result = factor(result, levels = names(cmap_synergy)))
## `summarise()` has grouped output by 'Cluster_ChromBPNet'. You can override
## using the `.groups` argument.
length(unique(motifs_ranked$Cluster_ChromBPNet))
## [1] 189
table(motifs_ranked_mean$result)
## 
##        hard hard & soft        soft  no synergy  not tested 
##         115           0         113         100         189
# get the ranks per group
ranks1 <- map(c("not tested", "hard", "soft", "no synergy"),
              ~ motifs_ranked_mean %>% filter(result == .x) %>% pull(mean_norm_rank)) %>% set_names(c("not tested", "hard", "soft", "no synergy"))

# Wilcoxon rank-sum tests
tibble::tribble(
  ~ "comparison",       ~ "p_value",
  "hard vs not tested", wilcox.test(ranks1$hard, ranks1$not_tested,   paired = FALSE)$p.value,
  "hard vs soft",       wilcox.test(ranks1$hard, ranks1$soft,         paired = FALSE)$p.value,
  "hard vs no synergy", wilcox.test(ranks1$hard, ranks1$`no synergy`, paired = FALSE)$p.value,
  "soft vs not tested", wilcox.test(ranks1$soft, ranks1$not_tested,   paired = FALSE)$p.value,
  "soft vs no synergy", wilcox.test(ranks1$soft, ranks1$`no synergy`, paired = FALSE)$p.value)
## # A tibble: 5 × 2
##   comparison          p_value
##   <chr>                 <dbl>
## 1 hard vs not tested 1.33e-20
## 2 hard vs soft       1.07e-26
## 3 hard vs no synergy 7.15e-13
## 4 soft vs not tested 2.83e-20
## 5 soft vs no synergy 3.17e- 4

And for the mean importance-based rank:

motifs_ranked_mean2 <- motifs_ranked2 %>% 
  group_by(Cluster_ChromBPNet, result) %>% 
  summarize(mean_norm_rank = mean(norm_rank)) %>% 
  mutate(result = factor(result, levels = names(cmap_synergy)))
## `summarise()` has grouped output by 'Cluster_ChromBPNet'. You can override
## using the `.groups` argument.
length(unique(motifs_ranked2$Cluster_ChromBPNet))
## [1] 189
table(motifs_ranked_mean2$result)
## 
##        hard hard & soft        soft  no synergy  not tested 
##         115           0         113         100         189
# get the ranks per group
ranks2 <- map(c("not tested", "hard", "soft", "no synergy"),
              ~ motifs_ranked_mean2 %>% filter(result == .x) %>% pull(mean_norm_rank)) %>% set_names(c("not tested", "hard", "soft", "no synergy"))

# Wilcoxon rank-sum tests
tibble::tribble(
  ~ "comparison",       ~ "p_value",
  "hard vs not tested", wilcox.test(ranks2$hard, ranks2$not_tested,   paired = FALSE)$p.value,
  "hard vs soft",       wilcox.test(ranks2$hard, ranks2$soft,         paired = FALSE)$p.value,
  "hard vs no synergy", wilcox.test(ranks2$hard, ranks2$`no synergy`, paired = FALSE)$p.value,
  "soft vs not tested", wilcox.test(ranks2$soft, ranks2$not_tested,   paired = FALSE)$p.value,
  "soft vs no synergy", wilcox.test(ranks2$soft, ranks2$`no synergy`, paired = FALSE)$p.value)
## # A tibble: 5 × 2
##   comparison          p_value
##   <chr>                 <dbl>
## 1 hard vs not tested 1.33e-20
## 2 hard vs soft       7.32e- 9
## 3 hard vs no synergy 4.37e- 1
## 4 soft vs not tested 2.84e-20
## 5 soft vs no synergy 1.98e- 9

And for the summed importance-based rank:

motifs_ranked_mean3 <- motifs_ranked3 %>% 
  group_by(Cluster_ChromBPNet, result) %>% 
  summarize(mean_norm_rank = mean(norm_rank)) %>% 
  mutate(result = factor(result, levels = names(cmap_synergy)))
## `summarise()` has grouped output by 'Cluster_ChromBPNet'. You can override
## using the `.groups` argument.
length(unique(motifs_ranked2$Cluster_ChromBPNet))
## [1] 189
table(motifs_ranked_mean3$result)
## 
##        hard hard & soft        soft  no synergy  not tested 
##         115           0         113         100         189
# get the ranks per group
ranks3 <- map(c("not tested", "hard", "soft", "no synergy"),
              ~ motifs_ranked_mean3 %>% filter(result == .x) %>% pull(mean_norm_rank)) %>% set_names(c("not tested", "hard", "soft", "no synergy"))

# Wilcoxon rank-sum test
tibble::tribble(
  ~ "comparison",             ~ "p_value",
  "hard vs not tested",       wilcox.test(ranks3$hard,         ranks3$not_tested,   paired = FALSE)$p.value,
  "hard vs soft",             wilcox.test(ranks3$hard,         ranks3$soft,         paired = FALSE)$p.value,
  "hard vs no synergy",       wilcox.test(ranks3$hard,         ranks3$`no synergy`, paired = FALSE)$p.value,
  "soft vs not tested",       wilcox.test(ranks3$soft,         ranks3$not_tested,   paired = FALSE)$p.value,
  "soft vs no synergy",       wilcox.test(ranks3$soft,         ranks3$`no synergy`, paired = FALSE)$p.value,
  "no synergy vs not tested", wilcox.test(ranks3$`no synergy`, ranks3$`not tested`, paired = FALSE)$p.value)
## # A tibble: 6 × 2
##   comparison                p_value
##   <chr>                       <dbl>
## 1 hard vs not tested       1.33e-20
## 2 hard vs soft             1.90e- 3
## 3 hard vs no synergy       5.34e- 7
## 4 soft vs not tested       2.83e-20
## 5 soft vs no synergy       8.00e-13
## 6 no synergy vs not tested 5.53e- 2
p1 <- motifs_ranked_mean %>% 
  ggplot(aes(x = result, y = mean_norm_rank)) +
  geom_boxplot(aes(fill = result), outliers = FALSE, fill = "white") +
  # scale_fill_manual(values = cmap_synergy) +
  geom_jitter(width = 0.2, shape = 21, color = "gray60", alpha = 0.8) +
  no_legend() +
  xlab("type of motif") + ylab("mean normalized rank") +
  ggtitle("normalized ranks for each type of motif, \nacross cell types", subtitle = "based on number of hits (1 point = 1 cell type)" ) +
  ylim(c(0, 1))

p2 <- motifs_ranked_mean2 %>% 
  ggplot(aes(x = result, y = mean_norm_rank)) +
  geom_boxplot(aes(fill = result), outliers = FALSE, fill = "white") +
  # scale_fill_manual(values = cmap_synergy) +
  geom_jitter(width = 0.2, shape = 21, color = "gray60", alpha = 0.8) +
  no_legend() +
  xlab("type of motif") + ylab("mean normalized rank") +
  ggtitle("normalized ranks for each type of motif, \nacross cell types", subtitle = "based on mean trimmed CWM importance (1 point = 1 cell type)") +
  ylim(c(0, 1))

p3 <- motifs_ranked_mean3 %>% 
  ggplot(aes(x = result, y = mean_norm_rank)) +
  geom_boxplot(aes(fill = result), outliers = FALSE, fill = "white") +
  # scale_fill_manual(values = cmap_synergy) +
  geom_jitter(width = 0.2, shape = 21, color = "gray60", alpha = 0.8) +
  no_legend() +
  xlab("type of motif") + ylab("mean normalized rank") +
  ggtitle("normalized ranks for each type of motif, \nacross cell types", subtitle = "based on total trimmed CWM importance (1 point = 1 cell type)") +
  ylim(c(0, 1))

plot_grid(p1, #+ ggtitle(NULL, subtitle = NULL),
          p2, #+ ggtitle(NULL, subtitle = NULL),
          p3, #+ ggtitle(NULL, subtitle = NULL),
          nrow = 1)

7.3 LFC, Z-score, and counts

p0 <- all_results_best %>% 
  ggplot(aes(x = sum_effect, y = joint_effect, label = motif_name)) +
  geom_abline(slope = 1, linetype = "dashed", color = "black") +
  geom_point(aes(size = -log10(joint_vs_sum_p_value_adj), fill = category, shape = result, color = category), alpha = 0.7, stroke = 1) +
  scale_fill_manual(values = cmap_category) +
  scale_color_manual(values = cmap_category) +
  scale_shape_manual(values = shapemap_result) +
  # scale_fill_manual(values = palette_coop) +
  ylab("Joint effect") +
  xlab("Independent effect") +
  xlim(c(0, 2)) + ylim(c(0, 2)) +
  square() +
  theme(legend.position = "bottom")

shapemap_result2 <- shapemap_result
shapemap_result2["no synergy"] <- 21

p0_2 <- all_results_best %>%
  mutate(max_Z = ifelse(result == "no synergy", NA, max_Z)) %>%
  ggplot(aes(x = sum_effect, y = joint_effect, label = motif_name)) +
  geom_abline(slope = 1, linetype = "dashed", color = "black") +
  geom_point(aes(fill = max_Z, size = -log10(joint_vs_sum_p_value_adj), shape = result), alpha = 0.7, color = "black") +
  scale_fill_gradientn(colors = ylrd, na.value = "gray70") +
  scale_color_gradientn(colors = ylrd, na.value = "gray70") +
  scale_shape_manual(values = shapemap_result2) +
  ylab("AB marginal effect, ln(counts)") +
  xlab("A+B marignal effect, ln(counts)") +
  xlim(c(0, 2)) + ylim(c(0, 2)) +
  square() +
  theme(legend.position = "bottom")

plot_grid(p0, p0_2, nrow = 1, align = "h", axis = "tb")

ggplotly(p0_2)

7.4 Behaviour across 200bp

This implies that our categories have different overall behavirous across 200 bp.

all_results_spacing %>% 
  filter(is_best_orientation) %>% 
  ggplot(aes(x = dist_center_to_center, y = joint_vs_ind_per_arragement, group = motif_name, label = motif_name)) +
  geom_hline(yintercept = 0, color = "black", alpha = 0.3, lwd = 1) +
  geom_line(alpha = 0.3, aes(color = motif_name)) +
  annotate('point', x = 10.5, y = 1.4,
           shape = 25, size = 2, color = 'red', fill = 'red', alpha = 0.5) +
  annotate('text', x = 35, y = 1.4, color = 'red', label = "10.5 bp") +
  facet_wrap(~ result, ncol = 1) +
  ylab("AB - (A+B) effects in log-counts") + xlab("center-to-center distance (bp)") +
  scale_color_manual(values = palette_motifs) +
  no_legend() +
  ylim(c(-0.5, 1.5))

7.5 Heatmap representation

# motif order - by result, then palindromes, then Z
motif_order <- all_results_best %>%
  mutate(result = factor(result, levels = rev(names(shapemap_result)))) %>% 
  # sort by increasing z-score
  arrange(result, max_Z) %>% 
  pull(motif_name) %>% 
  unique()

motif_order2 <- all_results_best %>%
  mutate(result = factor(result, levels = rev(names(shapemap_result)))) %>% 
  # sort by increasing z-score
  arrange(category, palindromicity, max_Z) %>% 
  pull(motif_name) %>% 
  unique()

results_w_spacing <- all_results_spacing %>% 
  mutate(result = factor(result, levels = rev(names(shapemap_result)))) %>% 
  arrange(result, joint_effect) %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order),
         category = factor(category, levels = c("homocomposite", "heterocomposite")))

results_w_zscore <- all_results_spacing %>% 
  left_join(z_scores, by = c("orientation", "spacing", "motif_name", "category")) %>% 
  mutate(result = factor(result, levels = rev(names(shapemap_result)))) %>% 
  arrange(result, Z) %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order),
         category = factor(category, levels = c("homocomposite", "heterocomposite")),
         # cap the Z-score at 5 for better visualization of smaller Z-scores
         # TODO: edit legend to indicate >=N
         Z = ifelse(Z > 5, 5, Z))

7.5.1 Specific pairs

Helper function to plot the heatmap for one motif pair, similarly to Taipale lab 2015 paper:

#' @param motif character, motif namee
#' Z-scored effect
plot_heatmap_per_pair <- function(motif, n_bp = 20) {
  
  orientations <- c("HH", "TT", "A,B HT", "B,A HT")
  
  p1 <- results_w_spacing %>% 
    filter(motif_name == motif) %>% 
    filter(spacing < n_bp) %>% 
    mutate(orientation = factor(orientation, levels = rev(orientations))) %>% 
    ggplot(aes(x = spacing, y = orientation)) +
    geom_tile(aes(fill = joint_effect), color = "white") +
    scale_fill_gradientn(colors = viridis::magma(100)) +
    theme(strip.background = element_blank(),
          panel.border = element_blank(),
          # axis.text.y = element_text(size = 9),
          axis.ticks.y = element_blank(),
          strip.text.y = element_blank()) +
    xlab("spacing (bp)") + ylab("orientations") + labs(fill = "effect_mean") +
    ggtitle(motif)
  
  p2 <- results_w_zscore %>% 
    filter(motif_name == motif) %>% 
    filter(spacing < n_bp) %>% 
    mutate(orientation = factor(orientation, levels = rev(orientations))) %>% 
    ggplot(aes(x = spacing, y = orientation)) +
    geom_tile(aes(fill = Z),color = "black") +
    scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
    theme(strip.background = element_blank(),
          panel.border = element_blank(),
          # axis.text.y = element_(),
          axis.ticks.y = element_blank()) +
    xlab("spacing (bp)") + ylab("orientations") + labs(fill = "Z-scored effect")
  
  plot_grid(p1, p2, ncol = 1, align = "v", axis = "rl", rel_heights = c(1.2, 1))
  
}
plot_heatmap_per_pair("467|p53_p53#1")

plot_heatmap_per_pair("434|SOX_SOX#1")

7.5.2 Heatmaps across spacings

Let’s try a version where for each motif, we only select the best orientation and show the results at that one:

all_results_spacing %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order2)) %>% 
  ggplot(aes(x = spacing, y = motif_name)) +
  geom_tile(aes(fill = joint_vs_ind_per_arragement)) +
  facet_grid(result ~ ., scales = "free_y", space = "free_y") + 
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  theme(strip.background = element_blank(),
        strip.text.y = element_text(angle = 0),
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 5)) +
  ggtitle("Joint - independent effect") +
  xlab("spacing (bp)") + ylab(NULL)

7.5.3 Composite motifs w/ syntax effects

We can reformat the data a bit to get the plots for all “specific” cooperative motifs, including both homocomposites and heterocomposites:

p1 <- results_w_spacing %>%
  mutate(motif_name = factor(motif_name, levels = motif_order2)) %>% 
  filter(result %in% c("hard", "hard & soft")) %>%
  filter(spacing < 20) %>%
  ggplot(aes(x = spacing, y = motif_name)) +
  geom_tile(aes(fill = joint_effect)) +
  facet_grid(category ~ orientation, scales = "free_y", space = "free_y") +
  scale_fill_gradientn(colors = viridis::magma(100)) +
  theme(strip.background = element_blank(),
        panel.border = element_blank(),
        axis.text.y = element_text(size = 9),
        axis.ticks.y = element_blank(),
        strip.text.y = element_blank(),
        legend.position = "bottom") +
  ggtitle("Marginal joint effect") +
  xlab("spacing (bp)") + ylab("motif pair")

p2 <- results_w_zscore %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order2)) %>% 
  filter(result %in% c("hard", "hard & soft")) %>%
  filter(spacing < 20) %>% 
  ggplot(aes(x = spacing, y = motif_name)) +
  geom_tile(aes(fill = Z)) +
  facet_grid(category ~ orientation, scales = "free_y", space = "free_y") + 
  scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) +
  theme(strip.background = element_blank(),
        panel.border = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom") +
  ggtitle("Z-scored effect") +
  xlab("spacing (bp)") + ylab(NULL)

# plot_grid(p1, p2, nrow = 1, rel_widths = c(1.4, 1))

Plot some features of these motifs

coop_subset <- motifs_compiled_unique %>%
  right_join(all_results_best %>% dplyr::select(motif_name, result, Z, joint_effect), by = "motif_name") %>% 
  filter(result %in% c("hard", "hard & soft"))

dim(coop_subset)
## [1] 48 49
motifs_filt <- coop_subset %>% mutate(annotation_broad = motif_name)
hits_annotated <- hits_all_anno %>% mutate(annotation_broad = motif_name) %>% 
  filter(motif_name %in% coop_subset$motif_name)

# group by broad annotation and calculate summary stats
motifs_to_plot <- motifs_filt %>% 
  group_by(annotation_broad) %>% 
  mutate(total_hits_broad = sum(total_hits),
         n_variants = n()) %>%
  # for visualization of the logo, use the motif w/ the most hits
  slice_max(n = 1, order_by = total_hits) %>% 
  arrange(desc(total_hits_broad))

# plot breakdown of total hits based on cell type compartment
p4 <- hits_annotated %>%
  dplyr::select(annotation_broad, compartment, category, n) %>%
  group_by(annotation_broad, compartment, category) %>%
  summarize(n_hits = sum(n)) %>%
  mutate(annotation_broad = factor(annotation_broad, levels = motif_order2)) %>%
  ggplot(aes(y = annotation_broad, x = n_hits)) +
  geom_col(aes(fill = compartment), position = "fill") +
  scale_fill_manual(values = cmap_compartment2) +
  facet_grid(category ~ ., space = "free_y", scales = "free_y") + 
  ylab(NULL) + xlab(NULL) + ggtitle("% hits \nby compartment") +
  theme(panel.grid = element_blank(),
        panel.border = element_blank(),
        strip.background = element_blank(),
        # axis.text.y = element_blank(),
        # axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 10),
        legend.position = "bottom") +
  ggplot2::guides(fill=guide_legend(ncol=2))
## `summarise()` has grouped output by 'annotation_broad', 'compartment'. You can
## override using the `.groups` argument.
# plot breakdown of total hits based on organ
p5 <- hits_annotated %>%
  dplyr::select(annotation_broad, organ, n, category) %>%
  group_by(annotation_broad, organ, category) %>%
  summarize(n_hits = sum(n)) %>%
  mutate(annotation_broad = factor(annotation_broad, levels = motif_order2)) %>%
  ggplot(aes(y = annotation_broad, x = n_hits)) +
  geom_col(aes(fill = organ), position = "fill") +
  scale_fill_manual(values = cmap_organ) +
  facet_grid(category ~ ., space = "free_y", scales = "free_y") + 
  ylab(NULL) + xlab(NULL) + ggtitle("% hits \nby compartment") +
  theme(panel.grid = element_blank(),
        panel.border = element_blank(),
        strip.background = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(size = 10),
        legend.position = "bottom") +
  ggplot2::guides(fill=guide_legend(ncol=2))
## `summarise()` has grouped output by 'annotation_broad', 'organ'. You can
## override using the `.groups` argument.
p6 <- motifs_compiled_unique %>%
  filter(motif_name %in% motifs_filt$motif_name) %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order2)) %>%
  ggplot(aes(y = motif_name, x = "1")) +
  # color scale is on log10
  geom_tile(aes(fill = log10(total_hits)), alpha = 1) +
  # labels are non log-transformed
  geom_text(aes(label = scales::comma(total_hits)), color = "white", size = 3) +
  # don't include the very lightest colors; too hard to read
  scale_fill_gradientn(colors = viridis::plasma(100)[0:95], limits = c(0, 8)) +
  facet_grid(category ~ ., space = "free_y", scales = "free_y") + 
  ylab(NULL) + xlab(NULL) + ggtitle("total # hits") +
  rotate_x() +
  hide_ticks() +
  theme(panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.position = "bottom")

# plot_grid(p4 + theme(strip.text = element_blank()), p5, p6, nrow = 1, align = "h", axis = "tb", rel_widths = c(2, 1, 1))

Get logos:

# get motifs in the new order
cwm_list_subset <- cwm_list[motifs_to_plot$motif_name] %>% 
  map(trim_cwm, flank = 2)

# use the annotation_broad name instead of the motif name
# base_representative$motif_name_short <- gsub("^[^|]*\\|([^#]*)#.*$", "\\1", base_representative$motif_name)
names(cwm_list_subset) <- plyr::mapvalues(names(cwm_list_subset),
                                          from = motifs_to_plot$motif_name,
                                          to = as.character(motifs_to_plot$annotation_broad))

hetero_specific <- coop_subset %>% filter(category == "heterocomposite") %>% pull(motif_name)
homo_specific <- coop_subset %>% filter(category == "homocomposite") %>% pull(motif_name)

p_logos_homo <- ggplot() +
  geom_logo(cwm_list_subset[rev(motif_order2[motif_order2 %in% homo_specific])], method = "custom", font = "roboto_bold") +
  theme_logo() +
  facet_grid(seq_group ~ ., scales = "free_y", switch = "both") +
  theme(strip.text.y.left = element_text(angle = 0, hjust = 1),
        # reduce margins between facets, so that the logos can take up
        # a little bit more space
        panel.spacing=unit(0, "lines")
  ) +
  hide_ticks() + ylab(NULL)

p_logos_hetero <- ggplot() +
  geom_logo(cwm_list_subset[rev(motif_order2[motif_order2 %in% hetero_specific])], method = "custom", font = "roboto_bold") +
  theme_logo() +
  facet_grid(seq_group ~ ., scales = "free_y", switch = "both") +
  theme(strip.text.y.left = element_text(angle = 0, hjust = 1),
        # reduce margins between facets, so that the logos can take up
        # a little bit more space
        panel.spacing=unit(0, "lines")
  ) +
  hide_ticks() + ylab(NULL)

p_logos <- plot_grid(p_logos_homo, p_logos_hetero, ncol = 1, align = "v", axis = "rl", rel_heights = c(1, length(hetero_specific)/length(homo_specific)))
p_palindromic <- all_results_best %>%
  filter(result %in% c("hard", "hard & soft")) %>% 
  dplyr::select(motif_name, category, seqA_palindrome, seqB_palindrome) %>% 
  mutate(category = factor(category, levels = c("homocomposite", "heterocomposite", "not cooperative"))) %>% 
  gather(seq, palindrome, seqA_palindrome, seqB_palindrome) %>% 
  mutate(motif_name = factor(motif_name, levels = motif_order2)) %>%
  ggplot(aes(x = seq, y = motif_name)) +
  geom_tile(aes(fill = palindrome), alpha = 0.8) +
  geom_text(aes(label = palindrome), fontface = "bold", size = 4, color = "black") +
  facet_grid(category ~ ., scales = "free_y", space = "free_y") + 
  scale_fill_manual(values = c("P" = "#2bad0a", "Q" = "#8fe37b", "-" = "gray90")) + #, aesthetics = c("color", "fill")) +
  ggtitle("palindromic") +
  no_legend() +
  ylab(NULL) +
  xlab(NULL) +
  theme(panel.grid = element_blank(),
        panel.border = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        strip.background = element_blank())

plot_grid(p_logos,
          p_palindromic + theme(strip.text = element_blank()),
          p6 + theme(strip.text = element_blank()),
          p2 + theme(strip.text.y = element_blank()),
          p4 + theme(strip.text = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank()),
          p5, nrow = 1, rel_widths = c(1.5, 0.2, 0.2, 2, 0.5, 0.5), rel_heights = c(1.5, 1, 1, 1, 1, 1, 1), align = "h", axis = "tb")

7.6 Curves for some examples

plot_curves <- function(motif, facet = TRUE, limits = NULL, best_orientation = FALSE, palette = NULL) {
  
  solo_stats <- solo_df %>% filter(orientation == "A + B" & motif_name == motif)
  
  if (best_orientation) {
    
    keep_orientation <- result_outs %>% filter(motif_name == motif) %>% pull(best_orientation)
    plot_df <- spacing_df %>% filter(motif_name == motif & orientation == keep_orientation)
    
  } else {
    
    plot_df <- spacing_df %>% filter(motif_name == motif)
  }
  
  p1 <- plot_df %>% 
    ggplot(aes(x = spacing)) +
    geom_hline(yintercept = solo_stats$effect_mean, color = "gray60", size = 2) +
    geom_line(aes(y = effect_mean, color = orientation), alpha = 0.7, size = 2) +
    geom_ribbon(aes(ymin = effect_mean - effect_sd, ymax = effect_mean + effect_sd, fill = orientation), alpha = 0.2) +
    annotate("text", label = "A + B", x = 190, y = solo_stats$effect_mean + 0.05, size = 6) +
    ylab("Marginal joint effect") + xlab("distance between motifs (bp)") +
    ggtitle(motif)
  
  # scale_color_manual(values = palette_orientation_hetero) +
  # scale_fill_manual(values = palette_orientation_hetero) +
  
  if (!is.null(limits)) p1 <- p1 + ylim(limits)
  if (facet) p1 <- p1 + facet_wrap(~ orientation, ncol = 1) + no_legend()
  if (!is.null(palette)) p1 <- p1 + scale_color_manual(values = palette)
  
  p1
  
}

This also lets us appreciate that true palindromes have identical outcomes because we’re literally doing the exact same experiment.

plot_curves("430|RUNX_TCF7L/LEF",   limits = c(0, 0.6), facet = FALSE, best_orientation = TRUE, palette = "red")

plot_curves("74|BHLH_HD:PBX1/PDX1", limits = c(0, 0.6), facet = FALSE, best_orientation = TRUE, palette = "red")

plot_curves("80|BHLH_MEF2#1",       limits = c(0, 0.6), facet = FALSE, best_orientation = TRUE, palette = "red")

plot_curves("423|RFX_SOX",          limits = c(0, 0.6), facet = FALSE, best_orientation = TRUE, palette = "red")

7.7 Composite motif 47 (Coordinator)

pred_profiles_47 <- data.table::fread(
  file.path(chrombpnet_dir, "03-syntax/04b/in_silico_marginalization/47|BHLH:TWIST.ZBTB_HD/profile_preds_A,B HT.tsv"),
  data.table = FALSE)

pred_profiles_47 %>% 
  ggplot(aes(x = position, y = mean)) +
  geom_line(color = "red") +
  geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd), fill = "red", alpha = 0.3) +
  facet_grid(spacing ~ .) +
  scale_y_continuous(breaks = c(0.1, 0.5)) +
  scale_x_continuous(labels = c(-500, -250, 0, 250, 500)) +
  theme(panel.spacing.y = unit(0.1, "lines"))

spacing_df %>% 
  filter(motif_name == "47|BHLH:TWIST/ZBTB_HD") %>% 
  filter(spacing <= 20) %>% 
  ggplot(aes(x = spacing, y = effect_mean)) +
  geom_bar(stat = "identity", color = "black", fill = "red") +
  geom_errorbar(aes(ymin = effect_mean - effect_sd, ymax = effect_mean + effect_sd), width = 0.5) +
  facet_wrap(~ orientation, ncol = 2)

7.8 Composite motif 74

pred_profiles_74 <- data.table::fread(
  file.path(chrombpnet_dir, "03-syntax/04b/in_silico_marginalization/74|BHLH_HD:PBX1.PDX1/profile_preds_B,A HT.tsv"),
  data.table = FALSE)

pred_profiles_74 %>% 
  ggplot(aes(x = position, y = mean)) +
  geom_line(color = "red") +
  geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd), fill = "red", alpha = 0.3) +
  facet_grid(spacing ~ .) +
  scale_y_continuous(breaks = c(0.1, 0.5)) +
  scale_x_continuous(labels = c(-500, -250, 0, 250, 500)) +
  theme(panel.spacing.y = unit(0.1, "lines"))

spacing_df %>% 
  filter(motif_name == "74|BHLH_HD:PBX1/PDX1") %>% 
  filter(spacing <= 20) %>% 
  ggplot(aes(x = spacing, y = effect_mean)) +
  geom_bar(stat = "identity", color = "black", fill = "red") +
  geom_errorbar(aes(ymin = effect_mean - effect_sd, ymax = effect_mean + effect_sd), width = 0.5) +
  facet_wrap(~ orientation, ncol = 2)

8 Interactive effects per motif

This section allows for exploring the outputs of the in silico experiments interactively.

8.0.1 Heterocomposites

8.0.2 Homocomposites

9 Session info

.libPaths()
## [1] "/oak/stanford/groups/wjg/sjessa/projects/HDMA/firstpass/renv/library/R-4.1/x86_64-pc-linux-gnu"
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"                                             
## [3] "/oak/stanford/groups/akundaje/sjessa/software/R"
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4         RColorBrewer_1.1-3     ggrastr_1.0.1         
##  [4] magrittr_2.0.3         GenomicFeatures_1.46.5 AnnotationDbi_1.56.2  
##  [7] Biobase_2.54.0         GenomicRanges_1.46.1   GenomeInfoDb_1.30.1   
## [10] IRanges_2.28.0         S4Vectors_0.32.4       BiocGenerics_0.40.0   
## [13] shiny_1.7.5            reactable_0.4.4        plotly_4.10.4         
## [16] TFBSTools_1.32.0       universalmotif_1.12.4  ggseqlogo_0.1         
## [19] pheatmap_1.0.12        cowplot_1.1.3          ggrepel_0.9.3         
## [22] stringr_1.5.0          purrr_1.0.2            glue_1.8.0            
## [25] scales_1.3.0           readr_2.1.4            ggplot2_3.5.1         
## [28] tidyr_1.3.0            dplyr_1.1.3            here_1.0.1            
## 
## loaded via a namespace (and not attached):
##   [1] BiocFileCache_2.2.1         plyr_1.8.8                 
##   [3] lazyeval_0.2.2              crosstalk_1.2.0            
##   [5] BiocParallel_1.28.3         digest_0.6.33              
##   [7] htmltools_0.5.6             viridis_0.6.4              
##   [9] GO.db_3.14.0                fansi_1.0.4                
##  [11] memoise_2.0.1               BSgenome_1.62.0            
##  [13] tzdb_0.4.0                  Biostrings_2.62.0          
##  [15] annotate_1.72.0             matrixStats_1.0.0          
##  [17] R.utils_2.12.2              vroom_1.6.3                
##  [19] prettyunits_1.1.1           colorspace_2.1-0           
##  [21] blob_1.2.4                  rappdirs_0.3.3             
##  [23] xfun_0.40                   crayon_1.5.2               
##  [25] RCurl_1.98-1.12             jsonlite_1.8.7             
##  [27] TFMPvalue_0.0.9             gtable_0.3.4               
##  [29] zlibbioc_1.40.0             XVector_0.34.0             
##  [31] DelayedArray_0.20.0         DBI_1.1.3                  
##  [33] Rcpp_1.0.11                 viridisLite_0.4.2          
##  [35] xtable_1.8-4                progress_1.2.2             
##  [37] bit_4.0.5                   DT_0.29                    
##  [39] htmlwidgets_1.6.2           httr_1.4.7                 
##  [41] ellipsis_0.3.2              farver_2.1.1               
##  [43] pkgconfig_2.0.3             XML_3.99-0.14              
##  [45] R.methodsS3_1.8.2           sass_0.4.7                 
##  [47] dbplyr_2.3.3                utf8_1.2.3                 
##  [49] labeling_0.4.3              tidyselect_1.2.0           
##  [51] rlang_1.1.1                 reshape2_1.4.4             
##  [53] later_1.3.1                 munsell_0.5.0              
##  [55] tools_4.1.2                 cachem_1.0.8               
##  [57] cli_3.6.1                   DirichletMultinomial_1.36.0
##  [59] generics_0.1.3              RSQLite_2.3.1              
##  [61] evaluate_0.21               fastmap_1.1.1              
##  [63] yaml_2.3.7                  fs_1.6.3                   
##  [65] knitr_1.43                  bit64_4.0.5                
##  [67] caTools_1.18.2              KEGGREST_1.34.0            
##  [69] mime_0.12                   R.oo_1.25.0                
##  [71] poweRlaw_0.70.6             pracma_2.4.2               
##  [73] xml2_1.3.5                  biomaRt_2.50.3             
##  [75] compiler_4.1.2              rstudioapi_0.15.0          
##  [77] beeswarm_0.4.0              filelock_1.0.2             
##  [79] curl_5.0.2                  png_0.1-8                  
##  [81] tibble_3.2.1                bslib_0.5.1                
##  [83] stringi_1.7.12              highr_0.10                 
##  [85] lattice_0.20-45             CNEr_1.30.0                
##  [87] Matrix_1.6-3                vctrs_0.6.3                
##  [89] pillar_1.9.0                lifecycle_1.0.3            
##  [91] jquerylib_0.1.4             data.table_1.14.8          
##  [93] bitops_1.0-7                httpuv_1.6.11              
##  [95] rtracklayer_1.54.0          R6_2.5.1                   
##  [97] BiocIO_1.4.0                promises_1.2.1             
##  [99] gridExtra_2.3               vipor_0.4.5                
## [101] MASS_7.3-60                 gtools_3.9.4               
## [103] seqLogo_1.60.0              SummarizedExperiment_1.24.0
## [105] rprojroot_2.0.3             rjson_0.2.21               
## [107] withr_2.5.0                 GenomicAlignments_1.30.0   
## [109] Rsamtools_2.10.0            GenomeInfoDbData_1.2.7     
## [111] parallel_4.1.2              hms_1.1.3                  
## [113] rmarkdown_2.24              MatrixGenerics_1.6.0       
## [115] ggbeeswarm_0.7.2            restfulr_0.0.15